mkdir -p results
# Make the blastdbs - they are output in same folder as the fasta input
conda activate MCHelper
while IFS=',' read -r species strain marta dean; do
makeblastdb -in ${marta} -dbtype "nucl"
done < <(sed '1d' curated_libs.csv)
# Run BLASTS - reciprocal not needed
while IFS=',' read -r species strain marta dean; do
blastn -query ${dean} -db ${marta} -outfmt 6 -max_hsps 1 -out results/${species}_${strain}_dean_vs_marta.blast.out
done < <(sed '1d' curated_libs.csv)Manual curation trials
Comparing results of different TEammo curators
Run BLASTs between curated libraries
Visualisation
if(!require("BiocManager", character.only = T)) install.packages("BiocManager")
pkgs.list <- readLines("r-requirements.txt")
for (i in 1:length(pkgs.list)){
if(!require(pkgs.list[i], character.only = T)){
BiocManager::install(pkgs.list[i], Ncpus = ceiling(parallel::detectCores()/1.7))
require(pkgs.list[i], character.only = TRUE)
}else{require(pkgs.list[1],character.only = TRUE)}
}Load and overall comparisons
setwd("/home/csic/gcy/jgp/extra_storage/dean/mctrials/mctrials")
source("scripts/mccompare_functions.R")
# Color schemes
palette_seq_match <- c(
"Missing from query lib" = "black", # Clear and bold
"Missing from subject lib" = "grey50", # Distinct but neutral
"Present, 70" = RColorBrewer::brewer.pal(1, "Greens")[1], # Light green
"Present, 80" = RColorBrewer::brewer.pal(2, "Greens")[2], # Medium green
"Perfect, 95-100" = RColorBrewer::brewer.pal(3, "Greens")[3] # Bright green
)
# Load the TE classifications
teClassification <- read.table("orozco_classification-2024_mchelper.csv", sep = ";", header = TRUE)
teClassification <- teClassification %>%
mutate(AppName = paste(Class, Order, Superfamily, sep ="/") %>% sub("/$", "", .) %>% sub("/$", "", .))
teClassification <- rbind(
teClassification
, c("", "Unclassified", "", "Unclassified")
)
# Load libraries to compare
# Drosophila birchii
lib_Dbir_marta <- teReadLib(
"../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
libIdentifier = "Marta",
species = "D.birchii",
strain = "v1.0_00_GenBank")
lib_Dbir_dean <- teReadLib(
"data/MCH_output/D.birchii/v1.0_00_GenBank/D.birchii_v1.0_00_GenBank_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.birchii",
strain = "v1.0_00_GenBank")
# Drosophila merina
lib_Dmer_marta <- teReadLib(
"../../data/final_libraries/D.merina/D.merina.TE_library.fasta",
libIdentifier = "Marta",
species = "D.merina", strain = "NA")
lib_Dmer_dean <- teReadLib(
"data/MCH_output/D.merina/NA/D.merina_NA_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.merina", strain = "NA")
# Drosophila neocordata
lib_Dneo_marta <- teReadLib(
"../../data/final_libraries/D.neocordata/D.neocordata.TE_library.fasta",
libIdentifier = "Marta",
species = "D.neocordata", strain = "nanopore_00")
lib_Dneo_dean <- teReadLib(
"data/MCH_output/D.neocordata/nanopore_00/D.neocordata_nanopore_00_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.neocordata", strain = "nanopore_00")
# Drosophila planitibia
lib_Dpla_marta <- teReadLib(
"../../data/final_libraries/D.planitibia/D.planitibia.TE_library.fasta",
libIdentifier = "Marta",
species = "D.planitibia", strain = "NA2")
lib_Dpla_dean <- teReadLib(
"data/MCH_output/D.planitibia/NA2/D.planitibia_NA2_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.planitibia", strain = "NA2")
# Drosophila santomea
lib_Dsan_marta <- teReadLib(
"../../data/final_libraries/D.santomea/D.santomea.TE_library.fasta",
libIdentifier = "Marta",
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq")
lib_Dsan_dean <- teReadLib(
"data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/D.santomea_STO_CAGO_1482_RefSeq_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq")
# Drosophila tristis
lib_Dtri_marta <- teReadLib(
"../../data/final_libraries/D.tristis/D.tristis.TE_library.fasta",
libIdentifier = "Marta",
species = "D.tristis", strain = "nanopore_D2")
lib_Dtri_dean <- teReadLib(
"data/MCH_output/D.tristis/nanopore_D2/D.tristis_nanopore_D2_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.tristis", strain = "nanopore_D2")# Load the blast results comparing curated libraries
blast_Dbir_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.birchii_v1.0_00_GenBank_dean_vs_marta.blast.out",
blast_query_lib = lib_Dbir_dean,
blast_subject_lib = lib_Dbir_marta,
species = "D.birchii",
strain = "v1.0_00_GenBank",
comparison = "dean_vs_marta")
blast_Dmer_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.merina_NA_dean_vs_marta.blast.out",
blast_query_lib = lib_Dmer_dean,
blast_subject_lib = lib_Dmer_marta,
species = "D.merina",
strain = "NA",
comparison = "dean_vs_marta")
blast_Dneo_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.neocordata_nanopore_00_dean_vs_marta.blast.out",
blast_query_lib = lib_Dneo_dean,
blast_subject_lib = lib_Dneo_marta,
species = "D.neocordata",
strain = "nanopore_00",
comparison = "dean_vs_marta")
blast_Dpla_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.planitibia_NA2_dean_vs_marta.blast.out",
blast_query_lib = lib_Dpla_dean,
blast_subject_lib = lib_Dpla_marta,
species = "D.planitibia",
strain = "NA2",
comparison = "dean_vs_marta")
blast_Dsan_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.santomea_STO_CAGO_1482_RefSeq_dean_vs_marta.blast.out",
blast_query_lib = lib_Dsan_dean,
blast_subject_lib = lib_Dsan_marta,
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq",
comparison = "dean_vs_marta")
blast_Dtri_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.tristis_nanopore_D2_dean_vs_marta.blast.out",
blast_query_lib = lib_Dtri_dean,
blast_subject_lib = lib_Dtri_marta,
species = "D.tristis",
strain = "nanopore_D2",
comparison = "dean_vs_marta")
# Bring them all together for overall
blast_all <- bind_rows(
blast_Dbir_dean_vs_marta,
blast_Dmer_dean_vs_marta,
blast_Dneo_dean_vs_marta,
blast_Dpla_dean_vs_marta,
blast_Dsan_dean_vs_marta,
blast_Dtri_dean_vs_marta
)How do the libraries compare overall?
loadPlotLibs(species = "D.birchii", strain = "v1.0_00_GenBank")Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/v1.0_00_GenBank-clean_families.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/v1.0_00_GenBank-clean_families.fa successfully loaded
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/curated_sequences_NR.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/curated_sequences_NR.fa successfully loaded
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_models.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/incomplete_models.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/incomplete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_808080.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_non808080.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_non808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_808080_recovered.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/complete_808080_recovered.fa successfully loaded
/standby_sequences.fa does not exist
/standby_707070.fa does not exist
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/incomplete_808080_recovered.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/1_MI_MCH/incomplete_808080_recovered.fa successfully loaded
/standby_707070_filtered.fa does not exist
Loading TE library: ./data/MCH_output/D.birchii/v1.0_00_GenBank/MCH_final/final_curated_NR.fa
./data/MCH_output/D.birchii/v1.0_00_GenBank/MCH_final/final_curated_NR.fa successfully loaded
There are empty libraries
loadPlotLibs(species = "D.merina", strain = "NA")Loading TE library: ./data/MCH_output/D.merina/NA/NA-clean_families.fa
./data/MCH_output/D.merina/NA/NA-clean_families.fa successfully loaded
Loading TE library: ./data/MCH_output/D.merina/NA/curated_sequences_NR.fa
./data/MCH_output/D.merina/NA/curated_sequences_NR.fa successfully loaded
Loading TE library: ./data/MCH_output/D.merina/NA/1_MI_MCH/complete_models.fa
./data/MCH_output/D.merina/NA/1_MI_MCH/complete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.merina/NA/1_MI_MCH/incomplete_models.fa
./data/MCH_output/D.merina/NA/1_MI_MCH/incomplete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.merina/NA/1_MI_MCH/complete_808080.fa
./data/MCH_output/D.merina/NA/1_MI_MCH/complete_808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.merina/NA/1_MI_MCH/complete_non808080.fa
./data/MCH_output/D.merina/NA/1_MI_MCH/complete_non808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.merina/NA/1_MI_MCH/complete_808080_recovered.fa
./data/MCH_output/D.merina/NA/1_MI_MCH/complete_808080_recovered.fa successfully loaded
/standby_sequences.fa does not exist
/standby_707070.fa does not exist
Loading TE library: ./data/MCH_output/D.merina/NA/1_MI_MCH/incomplete_808080_recovered.fa
./data/MCH_output/D.merina/NA/1_MI_MCH/incomplete_808080_recovered.fa successfully loaded
/standby_707070_filtered.fa does not exist
Loading TE library: ./data/MCH_output/D.merina/NA/MCH_final/final_curated_NR.fa
./data/MCH_output/D.merina/NA/MCH_final/final_curated_NR.fa successfully loaded
There are empty libraries
loadPlotLibs(species = "D.neocordata", strain = "nanopore_00")Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/nanopore_00-clean_families.fa
./data/MCH_output/D.neocordata/nanopore_00/nanopore_00-clean_families.fa successfully loaded
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/curated_sequences_NR.fa
./data/MCH_output/D.neocordata/nanopore_00/curated_sequences_NR.fa successfully loaded
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_models.fa
./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/incomplete_models.fa
./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/incomplete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_808080.fa
./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_non808080.fa
./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_non808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_808080_recovered.fa
./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/complete_808080_recovered.fa successfully loaded
/standby_sequences.fa does not exist
/standby_707070.fa does not exist
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/incomplete_808080_recovered.fa
./data/MCH_output/D.neocordata/nanopore_00/1_MI_MCH/incomplete_808080_recovered.fa successfully loaded
/standby_707070_filtered.fa does not exist
Loading TE library: ./data/MCH_output/D.neocordata/nanopore_00/MCH_final/final_curated_NR.fa
./data/MCH_output/D.neocordata/nanopore_00/MCH_final/final_curated_NR.fa successfully loaded
There are empty libraries
loadPlotLibs(species = "D.planitibia", strain = "NA2")Loading TE library: ./data/MCH_output/D.planitibia/NA2/NA2-clean_families.fa
./data/MCH_output/D.planitibia/NA2/NA2-clean_families.fa successfully loaded
Loading TE library: ./data/MCH_output/D.planitibia/NA2/curated_sequences_NR.fa
./data/MCH_output/D.planitibia/NA2/curated_sequences_NR.fa successfully loaded
Loading TE library: ./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_models.fa
./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.planitibia/NA2/1_MI_MCH/incomplete_models.fa
./data/MCH_output/D.planitibia/NA2/1_MI_MCH/incomplete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_808080.fa
./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_non808080.fa
./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_non808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_808080_recovered.fa
./data/MCH_output/D.planitibia/NA2/1_MI_MCH/complete_808080_recovered.fa successfully loaded
/standby_sequences.fa does not exist
/standby_707070.fa does not exist
Loading TE library: ./data/MCH_output/D.planitibia/NA2/1_MI_MCH/incomplete_808080_recovered.fa
./data/MCH_output/D.planitibia/NA2/1_MI_MCH/incomplete_808080_recovered.fa successfully loaded
/standby_707070_filtered.fa does not exist
Loading TE library: ./data/MCH_output/D.planitibia/NA2/MCH_final/final_curated_NR.fa
./data/MCH_output/D.planitibia/NA2/MCH_final/final_curated_NR.fa successfully loaded
There are empty libraries
loadPlotLibs(species = "D.santomea", strain = "STO_CAGO_1482_RefSeq")Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/STO_CAGO_1482_RefSeq-clean_families.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/STO_CAGO_1482_RefSeq-clean_families.fa successfully loaded
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/curated_sequences_NR.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/curated_sequences_NR.fa successfully loaded
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_models.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/incomplete_models.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/incomplete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_808080.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_non808080.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_non808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_808080_recovered.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/complete_808080_recovered.fa successfully loaded
/standby_sequences.fa does not exist
/standby_707070.fa does not exist
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/incomplete_808080_recovered.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/1_MI_MCH/incomplete_808080_recovered.fa successfully loaded
/standby_707070_filtered.fa does not exist
Loading TE library: ./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/MCH_final/final_curated_NR.fa
./data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/MCH_final/final_curated_NR.fa successfully loaded
There are empty libraries
loadPlotLibs(species = "D.tristis", strain = "nanopore_D2")Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/nanopore_D2-clean_families.fa
./data/MCH_output/D.tristis/nanopore_D2/nanopore_D2-clean_families.fa successfully loaded
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/curated_sequences_NR.fa
./data/MCH_output/D.tristis/nanopore_D2/curated_sequences_NR.fa successfully loaded
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_models.fa
./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/incomplete_models.fa
./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/incomplete_models.fa successfully loaded
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_808080.fa
./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_non808080.fa
./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_non808080.fa successfully loaded
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_808080_recovered.fa
./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/complete_808080_recovered.fa successfully loaded
/standby_sequences.fa does not exist
/standby_707070.fa does not exist
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/incomplete_808080_recovered.fa
./data/MCH_output/D.tristis/nanopore_D2/1_MI_MCH/incomplete_808080_recovered.fa successfully loaded
/standby_707070_filtered.fa does not exist
Loading TE library: ./data/MCH_output/D.tristis/nanopore_D2/MCH_final/final_curated_NR.fa
./data/MCH_output/D.tristis/nanopore_D2/MCH_final/final_curated_NR.fa successfully loaded
There are empty libraries
tePlotLib(list(lib_Dbir_dean, lib_Dbir_marta))tePlotLib(list(lib_Dmer_dean, lib_Dmer_marta))tePlotLib(list(lib_Dpla_dean, lib_Dpla_marta))tePlotLib(list(lib_Dneo_dean, lib_Dneo_marta))tePlotLib(list(lib_Dsan_dean, lib_Dsan_marta))tePlotLib(list(lib_Dtri_dean, lib_Dtri_marta))How often do the curators agree based on BLASTn sequence identity between their final curated libraries?
PlotBlastBarMatches(blast_all)PlotBlastBarMatches(blast_Dbir_dean_vs_marta)PlotBlastBarMatches(blast_Dmer_dean_vs_marta)PlotBlastBarMatches(blast_Dneo_dean_vs_marta)PlotBlastBarMatches(blast_Dpla_dean_vs_marta)PlotBlastBarMatches(blast_Dsan_dean_vs_marta)PlotBlastBarMatches(blast_Dtri_dean_vs_marta)How often do curators agree on TE classifications?
PlotBlastTileMatches(blast_all)PlotBlastTileMatches(blast_Dbir_dean_vs_marta)PlotBlastTileMatches(blast_Dmer_dean_vs_marta)PlotBlastTileMatches(blast_Dneo_dean_vs_marta)PlotBlastTileMatches(blast_Dpla_dean_vs_marta)PlotBlastTileMatches(blast_Dsan_dean_vs_marta)PlotBlastTileMatches(blast_Dtri_dean_vs_marta)TE size distribution
lib1 <- width(lib_Dtri_dean) %>%
data.frame(length = .)
lib1$curator <- "dean"
lib2 <- width(lib_Dtri_marta) %>%
data.frame(length = .)
lib2$curator <- "marta"
bind_rows(lib1, lib2) %>%
ggplot(aes(x = length)) +
geom_density(aes(fill = curator), alpha = 0.5, color = "black") +
labs(title = "Density Plot of DNA Sequence Lengths",
x = "Sequence Length (bp)",
y = "Density") +
theme_minimal()